# Load the modularity helper functions into the knitr session, then read the
# processed ASV (amplicon sequence variant) table. The file name suggests
# RRA/prevalence/read-depth filtering thresholds — TODO confirm against the
# processing script.
source("../modularity_analysis_functions.R", local = knitr::knit_global())# reading the microbiome data
# working only with Rattus from the three villages
# villages: Andatsakala, Mandena, Sarahandrano
data_asv <- read_csv("../../data/data_processed/microbiome/data_asv_rra0.001_p0.01_th5000_all.csv")# small mammals data
# Host metadata: host_ID is the numeric part of animal_id; only hosts that
# also appear in the microbiome table are kept.
small_mammals <- read_csv("../../data/data_raw/data_small_mammals/Terrestrial_Mammals.csv") %>%
# extract the first run of digits from animal_id as a numeric host identifier
mutate(host_ID = as.numeric(gsub(".*?([0-9]+).*", "\\1", animal_id))) %>%
mutate(age_repro = as_factor(age_repro)) %>%
dplyr::rename(grid = habitat_type) %>%
filter(host_ID %in% data_asv$host_ID)
# Descriptive plots of the trapped hosts: abundance, sex ratio and age ratio
# per land-use grid (chunk headers/cat() calls are knitr tab markers).
cat("## Abundance", '\n','\n')g <- small_mammals %>%
count(village,grid) %>%
ggplot(aes(x=grid, y=n)) +
geom_bar(position="dodge", stat="identity") +
theme_bw() +
theme(axis.text = element_text(size = 10, color = 'black', angle = 90, vjust = 0.5, hjust=1), title = element_text(size = 14)) +
labs(x="Land Use", y="Small-Mammals Abundance")
# Sex ratio per grid as stacked proportions; dashed line marks parity (0.5).
print(g)cat("## Sex ratio", '\n','\n')g <- small_mammals %>%
count(grid, sex) %>%
ggplot(aes(fill=sex, x=grid, y=n)) +
geom_bar(position="fill", stat="identity") +
geom_hline(yintercept = 0.5, linetype = "dashed") +
theme_bw() +
theme(axis.text = element_text(size = 10, color = 'black', angle = 90, vjust = 0.5, hjust=1), title = element_text(size = 14)) +
labs(x="Land Use", y="Proportion")
# Age (reproductive status) composition per grid.
print(g)cat("## Age ratio", '\n','\n')g <- small_mammals %>%
count(grid, age_repro) %>%
ggplot(aes(fill=age_repro, x=grid, y=n)) +
geom_bar(position="fill", stat="identity") +
theme_bw() +
theme(axis.text = element_text(size = 10, color = 'black', angle = 90, vjust = 0.5, hjust=1), title = element_text(size = 14)) +
labs(x="Land Use", y="Proportion")
print(g)# relative abundance of the most abundant Families of each group (code below keeps the top 8)
# microbes taxonomy
tax <- read_delim("../../data/data_raw/data_microbiome/ASVs_taxonomy_new.tsv") %>%
dplyr::rename(asv_ID = ASV)
# NOTE(review): data_asv_final is not defined in this file — presumably it is
# created by modularity_analysis_functions.R; confirm it is not meant to be
# data_asv (read above).
data_asv_tax <- data_asv_final %>%
left_join(tax, by="asv_ID")
# Total reads per ASV group (one total_reads value per host, de-duplicated first).
cat("## Relative abundance Family", '\n','\n')total_reads_groups <- data_asv_tax %>% distinct(host_ID, asv_core, total_reads) %>% group_by(asv_core) %>% summarise(n_total = sum(total_reads))
# Relative abundance per Family within each ASV group:
# reads appears to be a relative value, multiplied back by total_reads to get
# absolute counts — TODO confirm units of `reads`.
most_abu_family <- data_asv_tax %>%
mutate(reads_a = reads*total_reads) %>%
group_by(asv_core, Family) %>%
summarise(n= sum(reads_a)) %>%
left_join(total_reads_groups, by="asv_core") %>%
mutate(n_p = n/n_total)
# Top 8 named families per ASV group, flagged with p = "top".
most_abu_family_8 <- most_abu_family %>%
filter(!is.na(Family)) %>%
ungroup() %>%
slice_max(by = asv_core, order_by = n_p, n = 8) %>%
mutate(p = "top") %>%
select(asv_core, Family, p)
# The 5 largest modules (by number of distinct hosts) within each ASV group.
# modules_table_three_groups is created elsewhere (sourced functions file).
largest_modules <- modules_table_three_groups %>%
group_by(asv_core, host_group) %>%
summarise(n = n_distinct(host_ID)) %>%
ungroup() %>%
slice_max(by = asv_core, order_by = n, n = 5, with_ties = FALSE) %>%
mutate(p = "top")
# Relabel: hosts in a top-5 module keep their module id, all others are "Other".
modules_top <- modules_table_three_groups %>%
left_join(largest_modules, by=c("asv_core","host_group")) %>%
mutate(module = factor(ifelse(is.na(p),"Other",host_group))) %>%
distinct(host_ID,host_group,asv_core, module)
# Total reads per (ASV group, module) for normalisation below.
total_reads_modules <- modules_table_three_groups %>%
distinct(host_ID, asv_core, host_group, total_reads) %>%
left_join(modules_top %>% distinct(asv_core,host_group,module), by=c("asv_core","host_group")) %>%
group_by(asv_core, module) %>% summarise(n_total = sum(total_reads))
# Per-module relative abundance by Family; families outside the group-level
# top 8 are collapsed to ".Other", unknown families to ".NA" (the leading dot
# sorts them first in the legend).
most_abu_family_p <- data_asv_tax %>% left_join(modules_top, by=c("asv_core","host_ID")) %>%
mutate(reads_a = reads*total_reads) %>%
group_by(asv_core, Family, module) %>%
summarise(n= sum(reads_a)) %>%
left_join(total_reads_modules, by=c("asv_core","module")) %>%
mutate(n_p = n/n_total) %>%
left_join(most_abu_family_8, by=c("asv_core","Family")) %>%
# case_when order matters: top-8 rows always have a non-NA Family, so the
# first branch cannot hide the ".NA" branch.
mutate(p = case_when(p=="top" ~ Family,
is.na(Family) ~ ".NA",
is.na(p) ~ ".Other"))
library("RColorBrewer")
# First two greys map to the ".NA"/".Other" buckets (they sort first); the
# remaining 14 colors cover the named families.
colors <- c("grey40","grey80","darkblue","#ffd60a",brewer.pal(n = 12, name = "Paired"))
# Stacked relative-abundance bars per module, faceted by ASV group.
g <- most_abu_family_p %>%
group_by(asv_core,module, p) %>%
summarise(n_p = sum(n_p)) %>%
ggplot(aes(fill=p, x=module, y=n_p)) +
geom_bar(position="fill", stat="identity") +
facet_wrap(~asv_core, scales = "free_x") +
theme_bw() +
theme(axis.text = element_text(size = 10, color = 'black'), title = element_text(size = 14),
strip.text = element_text(size=12, color = 'black'),
panel.grid = element_blank(), panel.background = element_rect(colour = "black"), legend.key.size = unit(0.5, "cm")) +
scale_fill_manual(values=colors) +
labs(x="Module ID", y="Relative Abundance", fill = "Family")
# Commented-out exploratory code: per-module host degree summaries/boxplots.
print(g)cat('\n','\n')# a=modules_table_three_groups %>% filter(asv_core != "Rare") %>% mutate(module = factor(ifelse(host_group==1,1,0))) %>%
# count(host_ID, asv_core, module) %>%
# group_by(asv_core, module) %>%
# summarise(avg_degree = mean(n), median_degree = median(n))
#
# modules_table_three_groups %>% filter(asv_core != "Rare") %>% mutate(module = factor(ifelse(host_group==1,1,0))) %>%
# count(host_ID, asv_core, module) %>%
# ggplot(aes( x=module, y=n)) +
# geom_boxplot()+
# facet_wrap(~asv_core)+
# NOTE(review): set.seed(123) got fused onto this commented line during
# extraction — in the original Rmd it presumably opened the next chunk; as
# written here it never executes.
# theme_bw()set.seed(123)
# Host x ASV abundance matrix (wide format, zeros for absent ASVs).
data_asv_mat <- data_asv %>%
# filter(asv_core == "Non-core") %>%
select(host_ID, asv_ID, reads) %>%
spread(asv_ID, reads, fill = 0) %>%
column_to_rownames("host_ID") %>%
as.matrix()
# Host covariates for the hosts present in the matrix.
# NOTE(review): adonis2 matches rows of `data` to the distance matrix by
# position, not by name — confirm that small_mammals row order matches the
# host_ID row order of data_asv_mat.
hosts <- small_mammals %>%
filter(host_ID %in% rownames(data_asv_mat)) %>%
select(host_ID,village,grid,season,month,mass, sex,age_repro,age_dental,elevation.obs) %>%
mutate(village=as.factor(village),grid=as.factor(grid), season=as.factor(season),month=as.factor(month), sex=as.factor(sex), age_dental=as.factor(age_dental))
#left_join(modules_table_three_groups %>% filter(asv_core == "Non-core") %>% distinct(host_ID,host_group), by="host_ID") %>%
#mutate(module = factor(ifelse(host_group==1,1,0)))
# Bray-Curtis dissimilarity between hosts.
distance_matrix <- vegdist(data_asv_mat, method = "bray")
# Perform PERMANOVA (sequential terms: grid, then season, then village)
permanova_result <- adonis2(distance_matrix ~ grid+season+village, data = hosts, permutations = 999)
print(knitr::kable(permanova_result[,4:5]))| F | Pr(>F) | |
|---|---|---|
| grid | 2.331399 | 0.001 |
| season | 4.768973 | 0.001 |
| village | 5.666093 | 0.001 |
| Residual | NA | NA |
| Total | NA | NA |
cat('\n')# PERMANOVA post-hoc (disabled pairwise comparisons)
# perm_post_grid <- RVAideMemoire::pairwise.perm.manova(distance_matrix, fact = hosts$grid,
# test = "bonferroni", nperm = 999, progress = FALSE)$p.value %>% as.data.frame()
# print(knitr::kable(perm_post_grid))
# cat('\n')
#NMDS ordination (2 dimensions) of the Bray-Curtis distances
nmds_result <- metaMDS(distance_matrix, distance = "bray", k = 2, trace = F)
# preparing data for plotting: join NMDS coordinates back to host covariates
nmds_plot <- nmds_result$points %>%
as.data.frame() %>%
rownames_to_column("host_ID") %>%
mutate(host_ID = as.double(host_ID)) %>%
left_join(hosts, by="host_ID")
# plotting (small jitter only to reduce overplotting)
g1 <- nmds_plot %>%
ggplot( aes(MDS1, MDS2, color=grid, shape=village)) +
geom_point(size = 2, position=position_jitter(.01)) +
#stat_ellipse(aes(fill=grid), alpha=.1, type='norm',linetype =2, geom="polygon") + ##draws 95% confidence interval ellipses
theme_bw() +
#annotate("text", x=min(nmds_plot$MDS1)+0.04, size=3, y=max(nmds_plot$MDS2), label=paste('Stress =',round(nmds_result$stress,3))) +
labs(x = "NMDS1", y = "NMDS2", title = "(A) Bray-Curtis")
print(g1)set.seed(123)
library(GUniFrac)
# GUniFrac requires a rooted tree; phylo_tree is created elsewhere.
rooted_phylo_tree <- phangorn::midpoint(phylo_tree)
# NOTE(review): "d_UW" is the UNweighted UniFrac distance in GUniFrac, but
# the comment below and the figure title say "weighted" — confirm which is
# intended (weighted would be "d_1" or "d_0.5").
distance_matrix <- GUniFrac(data_asv_mat, rooted_phylo_tree, verbose = F)$unifracs[, , "d_UW"]
# Perform weighted UniFrac
unifrac_result <- adonis2(distance_matrix ~ grid+season+village, data = hosts, permutations = 999)
print(knitr::kable(unifrac_result[,4:5]))| F | Pr(>F) | |
|---|---|---|
| grid | 2.644989 | 0.001 |
| season | 6.105313 | 0.001 |
| village | 8.450150 | 0.001 |
| Residual | NA | NA |
| Total | NA | NA |
cat('\n')# PERMANOVA post-hoc (disabled pairwise comparisons)
# perm_post_grid <- RVAideMemoire::pairwise.perm.manova(distance_matrix, fact = hosts$grid,
# test = "bonferroni", nperm = 999, progress = FALSE)$p.value %>% as.data.frame()
# print(knitr::kable(perm_post_grid))
# cat('\n')
#NMDS on the UniFrac distance matrix computed above
nmds_result2 <- metaMDS(distance_matrix, k = 2, trace = F)
# preparing data for plotting
nmds_plot2 <- nmds_result2$points %>%
as.data.frame() %>%
rownames_to_column("host_ID") %>%
mutate(host_ID = as.double(host_ID)) %>%
left_join(hosts, by="host_ID")
# plotting; stress value annotated in the top-left corner
g2 <- nmds_plot2 %>%
ggplot( aes(MDS1, MDS2, color=grid, shape=season)) +
geom_point(size = 2, position=position_jitter(.05)) +
#stat_ellipse(aes(fill=grid), alpha=.1, type='norm',linetype =2, geom="polygon") + ##draws 95% confidence interval ellipses
theme_bw() +
annotate("text", x=min(nmds_plot2$MDS1)+0.15, size=3, y=max(nmds_plot2$MDS2), label=paste('Stress =',round(nmds_result2$stress,3))) +
# NOTE(review): title says "Weighted UniFrac" but the distance used above is
# "d_UW" (unweighted) — confirm and align label with the metric.
labs(x = "NMDS1", y = "NMDS2", title = "(B) Weighted UniFrac")
# Combine both ordinations with a shared, bottom-placed legend (patchwork).
print(g2)# plotting
g1 + g2 + plot_layout(guides='collect') &
theme(legend.position='bottom', legend.spacing.x=unit(0.1, 'cm'))cat('\n','\n')# taking only ASVs with known Genus
# Host/group/Genus presence records: drop the aggregate "All" group and rows
# with unknown Genus.
data_tax <- data_asv_tax %>%
filter(asv_core != "All") %>%
distinct(host_ID, asv_core, Genus) %>%
filter(!is.na(Genus))
# known ASVs: fraction of ASVs with a non-missing Genus assignment.
asv_genus <- data_asv_tax %>%
distinct(asv_ID, Genus) %>%
mutate(name = is.na(Genus))
# table()[1] is the FALSE count (known Genus) because FALSE sorts before TRUE.
# Fixed typo: "Percentege" -> "Percentage".
print(paste("Percentage of known ASVs:", round(table(asv_genus$name)[1]/nrow(asv_genus)*100, 2)))[1] “Percentage of known ASVs: 41.26”
cat('\n')# how many Genus?
print(paste("Number of genera:", length(unique(data_tax$Genus))))[1] “Number of genera: 119”
cat('\n')# making unique samples (host_ID+asv_core)
# unite() relies on host_ID and asv_core being adjacent columns (host_ID:asv_core).
data_tax_meta <- data_tax %>%
arrange(host_ID) %>%
unite(col="sample", host_ID:asv_core, remove = FALSE)
data_tax_meta2 <- data_tax_meta %>% distinct(sample, asv_core)
data_tax_summary <- data_tax %>%
count(asv_core, Genus) %>%
spread(Genus, n, fill = 0)
# transforming to a binary sample x Genus presence/absence matrix
data_tax_mat <- data_tax_meta %>%
select(sample, Genus) %>%
mutate(n=1) %>%
spread(Genus, n, fill = 0) %>%
column_to_rownames("sample") %>%
as.matrix()
# Perform PERMANOVA on Jaccard distances (presence/absence) by ASV group
permanova_result <- adonis2(data_tax_mat ~ asv_core, data = data_tax_meta2, method = "jaccard", permutations = 999)
print(paste("F =", permanova_result$F[1]))[1] “F = 155.763265711577”
cat('\n')print(paste("p-value =", permanova_result$`Pr(>F)`[1]))[1] “p-value = 0.001”
cat('\n')# PCA on the unscaled binary matrix
tax_pca <- prcomp(data_tax_mat)
# explained variance per component (percent)
ex_var <- tax_pca$sdev ^2
prop_ex_var <- ex_var/sum(ex_var)*100
loadings <- as.data.frame(tax_pca$rotation[, 1:2]) %>% rownames_to_column("Genus") %>%
mutate(PC1_abs = abs(PC1), PC2_abs = abs(PC2))
tax_scores <- as.data.frame(tax_pca$x[,1:2]) %>% rownames_to_column("sample") %>%
left_join(data_tax_meta2, by="sample")
# Two strongest loadings per axis, drawn as arrows (scaled x6 for visibility).
loading_top <- loadings %>%
slice_max(n = 2, order_by = PC1_abs) %>%
bind_rows(loadings %>% slice_max(n = 2, order_by = PC2_abs))
g1 <- tax_scores %>%
ggplot(aes(PC1, PC2, color=asv_core)) +
geom_hline(yintercept = 0, linetype = "dotted", color = "grey") +
geom_vline(xintercept = 0, linetype = "dotted", color = "grey") +
# NOTE(review): indexing group.colors by asv_core assumes asv_core is a
# character vector (name lookup); if it is a factor this indexes by integer
# code instead — confirm.
geom_point(color=group.colors[tax_scores$asv_core]) +
geom_segment(data = loading_top, inherit.aes = FALSE, aes(x = 0, y = 0, xend = PC1*6, yend = PC2*6),
arrow = arrow(length = unit(0.15, "cm"))) +
annotate("label", x = (loading_top$PC1*6), y = (loading_top$PC2*6),
label = loading_top$Genus, size=3, label.size=0.1, label.padding=unit(0.05, "lines")) +
theme_bw() +
# NOTE(review): element_rect(size=) is deprecated in ggplot2 >= 3.4 in favor of linewidth=.
theme(panel.border = element_rect(colour = "black", size=1), panel.grid = element_blank())+
scale_fill_manual(values=group.colors) +
labs(x = paste("PC1 (",round(prop_ex_var[1],2),"%)", sep = ""), y = paste("PC2 (",round(prop_ex_var[2],2),"%)", sep = ""),
title = "(A) Genus", color = "ASV Group")# taking only ASVs with known Family
# Same construction as the Genus section, now at Family level: drop the
# aggregate "All" group and rows with unknown Family.
data_tax <- data_asv_tax %>%
filter(asv_core != "All") %>%
distinct(host_ID, asv_core, Family) %>%
filter(!is.na(Family))
# known ASVs: fraction of ASVs with a non-missing Family assignment.
asv_genus <- data_asv_tax %>%
distinct(asv_ID, Family) %>%
mutate(name = is.na(Family))
# table()[1] is the FALSE count (known Family) because FALSE sorts before TRUE.
# Fixed typo: "Percentege" -> "Percentage".
print(paste("Percentage of known ASVs:", round(table(asv_genus$name)[1]/nrow(asv_genus)*100, 2)))[1] “Percentage of known ASVs: 90.72”
cat('\n')# how many Family?
print(paste("Number of families:", length(unique(data_tax$Family))))[1] “Number of families: 55”
cat('\n')# making unique samples (host_ID+asv_core)
data_tax_meta <- data_tax %>%
arrange(host_ID) %>%
unite(col="sample", host_ID:asv_core, remove = FALSE)
data_tax_meta2 <- data_tax_meta %>% distinct(sample, asv_core)
data_tax_summary <- data_tax %>%
count(asv_core, Family) %>%
spread(Family, n, fill = 0)
# transforming to a binary sample x Family presence/absence matrix
data_tax_mat <- data_tax_meta %>%
select(sample, Family) %>%
mutate(n=1) %>%
spread(Family, n, fill = 0) %>%
column_to_rownames("sample") %>%
as.matrix()
# Perform PERMANOVA on Jaccard distances by ASV group
permanova_result <- adonis2(data_tax_mat ~ asv_core, data = data_tax_meta2, method = "jaccard", permutations = 999)
print(paste("F =", permanova_result$F[1]))[1] “F = 188.830216433384”
cat('\n')print(paste("p-value =", permanova_result$`Pr(>F)`[1]))[1] “p-value = 0.001”
cat('\n')# PCA on the unscaled binary matrix
tax_pca <- prcomp(data_tax_mat)
# explained variance per component (percent)
ex_var <- tax_pca$sdev ^2
prop_ex_var <- ex_var/sum(ex_var)*100
loadings <- as.data.frame(tax_pca$rotation[, 1:2]) %>% rownames_to_column("Family") %>%
mutate(PC1_abs = abs(PC1), PC2_abs = abs(PC2))
tax_scores <- as.data.frame(tax_pca$x[,1:2]) %>% rownames_to_column("sample") %>%
left_join(data_tax_meta2, by="sample")
# Two strongest loadings per axis (arrows scaled x4 here).
loading_top <- loadings %>%
slice_max(n = 2, order_by = PC1_abs) %>%
bind_rows(loadings %>% slice_max(n = 2, order_by = PC2_abs))
g2 <- tax_scores %>%
ggplot(aes(PC1, PC2, color=asv_core)) +
geom_hline(yintercept = 0, linetype = "dotted", color = "grey") +
geom_vline(xintercept = 0, linetype = "dotted", color = "grey") +
geom_point(color=group.colors[tax_scores$asv_core]) +
geom_segment(data = loading_top, inherit.aes = FALSE, aes(x = 0, y = 0, xend = PC1*4, yend = PC2*4),
arrow = arrow(length = unit(0.15, "cm"))) +
annotate("label", x = (loading_top$PC1*4), y = (loading_top$PC2*4),
label = loading_top$Family, size=3, label.size=0.1, label.padding=unit(0.05, "lines")) +
scale_x_continuous(limits = c(-2, 2.7)) +
theme_bw() +
theme(panel.border = element_rect(colour = "black", size=1), panel.grid = element_blank())+
scale_fill_manual(values=group.colors) +
labs(x = paste("PC1 (",round(prop_ex_var[1],2),"%)", sep = ""), y = paste("PC2 (",round(prop_ex_var[2],2),"%)", sep = ""),
title = "(B) Family", color = "ASV Group")# plotting
# Combine Genus and Family PCA panels with a shared legend.
g1 + g2 + plot_layout(guides='collect') &
theme(legend.position='bottom')cat('\n','\n')# loop over the four ASV groups
# For each of the four ASV groups (All, Core, Non-core, Rare) print the
# degree distribution, connectance and pre-built module figures. All the
# *_three_groups lists and core_names come from the sourced functions file.
for (i in 1:4) {
cat('##',core_names[i],'{.tabset}','\n','\n')
cat('### ASVs degree distribution','\n')
print(asv_degree_distribution_three_groups[[i]])
cat('\n','\n')
# calculating connectance = realised links / (hosts x ASVs)
connectance_data <- modules_table_three_groups %>%
filter(asv_core == core_names[i])
cat('No. of hosts: ', length(unique(connectance_data$host_ID)) ,'\n','\n')
cat('No. of ASVs: ', length(unique(connectance_data$asv_ID)) ,'\n','\n')
cat('Connectance: ', nrow(connectance_data) / (length(unique(connectance_data$host_ID)) * length(unique(connectance_data$asv_ID))) ,'\n','\n')
cat('### Modules','\n')
cat('The color indicates number of host individuals in the module / total number of hosts in the whole grid [%]','\n','\n')
print(modules_three_groups[[i]])
cat('\n','\n')
cat('### Modules size','\n')
print(modules_size_three_groups[[i]])
cat('\n','\n')
cat('### No. of land uses','\n')
print(modules_grid_three_groups[[i]])
cat('\n','\n')
}No. of hosts: 855
No. of ASVs: 1951
Connectance: 0.05952383
The color indicates number of host individuals in the module / total number of hosts in the whole grid [%]
No. of hosts: 853
No. of ASVs: 103
Connectance: 0.3439147
The color indicates number of host individuals in the module / total number of hosts in the whole grid [%]
No. of hosts: 855
No. of ASVs: 1188
Connectance: 0.06003308
The color indicates number of host individuals in the module / total number of hosts in the whole grid [%]
No. of hosts: 829
No. of ASVs: 660
Connectance: 0.0148006
The color indicates number of host individuals in the module / total number of hosts in the whole grid [%]
# 2x2 panel of the four module figures with a collected legend (patchwork).
cat('## Modules','\n') modules_three_groups[[1]]+modules_three_groups[[2]]+modules_three_groups[[3]]+modules_three_groups[[4]] + plot_layout(nrow=2,ncol=2,guides='collect',axes = "collect_y",axis_titles = "collect") &
theme(legend.position='bottom') cat('\n','\n')library(ggsignif)
#library(ggbreak)
# Module size = number of distinct hosts per (ASV group, module).
modules_sizes <- modules_table_three_groups %>%
group_by(asv_core, host_group) %>%
summarise(n = n_distinct(host_ID))
# summary: mean and SD of module size per ASV group
cat('Summary','\n')Summary
modules_size_summary <- modules_sizes %>%
group_by(asv_core) %>%
summarise(mean = mean(n), sd = sd(n))
print(knitr::kable(modules_size_summary))| asv_core | mean | sd |
|---|---|---|
| All | 13.359375 | 50.491982 |
| Core | 77.545455 | 116.355802 |
| Non-core | 10.961538 | 45.098001 |
| Rare | 4.791907 | 2.617593 |
cat('\n')# ANOVA between groups: does module size differ by ASV group?
cat('ANOVA','\n')ANOVA
anova_result <- aov(n ~ asv_core, data = modules_sizes)
print(paste("F =", summary(anova_result)[[1]][["F value"]][1]))[1] “F = 13.1556781501674”
cat('\n')print(paste("p-value =", summary(anova_result)[[1]][["Pr(>F)"]][1]))[1] “p-value = 4.00172312122157e-08”
cat('\n')# Perform Tukey's HSD post hoc test
cat('Tukey post-hoc','\n')Tukey post-hoc
tukey_result <- TukeyHSD(anova_result)
# NOTE(review): this ratio (diff / lower CI bound) is NOT the studentized
# range q statistic (which is diff / SE); confirm intent or rename the column.
tukey_q <- tukey_result$asv_core[, "diff"] / tukey_result$asv_core[, "lwr"]
tukey_result <- cbind(tukey_result$asv_core, q_value = tukey_q) %>%
as.data.frame() %>%
rownames_to_column("comp")
print(knitr::kable(tukey_result))| comp | diff | lwr | upr | p adj | q_value |
|---|---|---|---|---|---|
| Core-All | 64.186080 | 32.54283 | 95.829334 | 0.0000017 | 1.9723573 |
| Non-core-All | -2.397836 | -18.74881 | 13.953141 | 0.9814720 | 0.1278927 |
| Rare-All | -8.567467 | -22.75144 | 5.616508 | 0.4031457 | 0.3765681 |
| Non-core-Core | -66.583916 | -97.80789 | -35.359941 | 0.0000004 | 0.6807622 |
| Rare-Core | -72.753547 | -102.89932 | -42.607779 | 0.0000000 | 0.7070363 |
| Rare-Non-core | -6.169631 | -19.39182 | 7.052558 | 0.6240981 | 0.3181564 |
# (A) Module size per ASV group. Significance annotations match the Tukey
# table above (Non-core-Core ***, Rare-Core ***, Rare-Non-core ns); red
# points mark group means.
cat('\n') g1 <- modules_sizes %>%
ggplot(aes(x=asv_core, y=n, fill=asv_core)) +
geom_boxplot(outliers = FALSE, alpha = 0.9) +
geom_jitter(color="black", size=1) +
theme_classic() +
#scale_y_continuous(limits = c(0, 145)) +
#scale_y_break(c(50,130), space=0.4, ticklabels = c(130,140))+
geom_signif(comparisons = list(c("Core", "Non-core"),c("Core", "Rare"),c("Non-core", "Rare")),
annotations = c("***","***","ns"), y_position = c(90,100,80), tip_length = 0) +
geom_point(data = modules_size_summary, aes(x=asv_core, y = mean), position = position_dodge(width = 0.75), size = 2, color="#d62828",alpha=0.8, show.legend=F) +
theme(axis.text = element_text(size = 11, color = 'black'), title = element_text(size = 15), legend.position = "none") +
scale_fill_manual(values=group.colors) +
labs(x="ASV Group", y="Module Size", title = "(A)")modules_grids <- modules_table_three_groups %>%
# (continuation of the fused modules_grids pipe: number of distinct land-use
# grids each module spans)
group_by(asv_core, host_group) %>%
summarise(n = n_distinct(grid))
# summary: mean and SD of land uses per module, by ASV group
cat('Summary','\n')Summary
modules_grids_summary <- modules_grids %>%
group_by(asv_core) %>%
summarise(mean = mean(n), sd = sd(n))
print(knitr::kable(modules_grids_summary))| asv_core | mean | sd |
|---|---|---|
| All | 2.609375 | 1.890806 |
| Core | 6.000000 | 1.264911 |
| Non-core | 2.910256 | 1.722169 |
| Rare | 3.028902 | 1.193138 |
cat('\n')# ANOVA between groups: number of land uses per module by ASV group
cat('ANOVA','\n')ANOVA
anova_result <- aov(n ~ asv_core, data = modules_grids)
print(paste("F =", summary(anova_result)[[1]][["F value"]][1]))[1] “F = 16.4393177523047”
cat('\n')print(paste("p-value =", summary(anova_result)[[1]][["Pr(>F)"]][1]))[1] “p-value = 5.790377827188e-10”
cat('\n')# Perform Tukey's HSD post hoc test
cat('Tukey post-hoc','\n')Tukey post-hoc
tukey_result <- TukeyHSD(anova_result)
# NOTE(review): diff/lwr is not the studentized range q statistic — see the
# same computation in the module-size section; confirm intent.
tukey_q <- tukey_result$asv_core[, "diff"] / tukey_result$asv_core[, "lwr"]
tukey_result <- cbind(tukey_result$asv_core, q_value = tukey_q) %>%
as.data.frame() %>%
rownames_to_column("comp")
print(knitr::kable(tukey_result))| comp | diff | lwr | upr | p adj | q_value |
|---|---|---|---|---|---|
| Core-All | 3.3906250 | 2.1350415 | 4.6462085 | 0.0000000 | 1.5880839 |
| Non-core-All | 0.3008814 | -0.3479146 | 0.9496774 | 0.6287771 | -0.8648139 |
| Rare-All | 0.4195267 | -0.1432841 | 0.9823376 | 0.2195791 | -2.9279367 |
| Non-core-Core | -3.0897436 | -4.3286904 | -1.8507968 | 0.0000000 | 0.7137825 |
| Rare-Core | -2.9710983 | -4.1672625 | -1.7749340 | 0.0000000 | 0.7129616 |
| Rare-Non-core | 0.1186453 | -0.4060024 | 0.6432931 | 0.9368576 | -0.2922281 |
# (B) Number of land uses per module. Annotations match the Tukey table above
# (Non-core-Core ***, Rare-Core ***, Rare-Non-core ns).
cat('\n') g2 <- modules_grids %>%
ggplot(aes(x=asv_core, y=n, fill=asv_core)) +
geom_boxplot(outliers = FALSE, alpha = 0.9) +
geom_jitter(color="black", size=1) +
theme_classic() +
scale_y_continuous(limits = c(0, 9), breaks = seq(0,8,by=2)) +
geom_signif(comparisons = list(c("Core", "Non-core"),c("Core", "Rare"),c("Non-core", "Rare")),
annotations = c("***","***","ns"), y_position = c(7.7,8.5,7), tip_length = 0) +
geom_point(data = modules_grids_summary, aes(x=asv_core, y = mean), position = position_dodge(width = 0.75), size = 2, color="#d62828",alpha=0.8, show.legend=F) +
theme(axis.text = element_text(size = 11, color = 'black'), title = element_text(size = 15), legend.position = "none") +
scale_fill_manual(values=group.colors) +
labs(x="ASV Group", y="No. of Land Uses", title = "(B)")modules_per_grid <- modules_table_three_groups %>%
# (continuation of the fused modules_per_grid pipe: number of distinct
# modules observed in each land-use grid)
group_by(asv_core, grid) %>%
summarise(n = n_distinct(host_group))
# summary: mean and SD of modules per land use, by ASV group
cat('Summary','\n')Summary
modules_per_grid_summary <- modules_per_grid %>%
group_by(asv_core) %>%
summarise(mean = mean(n), sd = sd(n))
print(knitr::kable(modules_per_grid_summary))| asv_core | mean | sd |
|---|---|---|
| All | 23.857143 | 8.254869 |
| Core | 9.428571 | 1.618347 |
| Non-core | 32.428571 | 13.339879 |
| Rare | 74.857143 | 29.390637 |
cat('\n')# ANOVA between groups: modules per land use by ASV group
cat('ANOVA','\n')ANOVA
anova_result <- aov(n ~ asv_core, data = modules_per_grid)
print(paste("F =", summary(anova_result)[[1]][["F value"]][1]))[1] “F = 19.9094294397124”
cat('\n')print(paste("p-value =", summary(anova_result)[[1]][["Pr(>F)"]][1]))[1] “p-value = 1.06302472869455e-06”
cat('\n')# Perform Tukey's HSD post hoc test
cat('Tukey post-hoc','\n')Tukey post-hoc
tukey_result <- TukeyHSD(anova_result)
# NOTE(review): diff/lwr is not the studentized range q statistic — confirm
# intent (see same computation in previous sections).
tukey_q <- tukey_result$asv_core[, "diff"] / tukey_result$asv_core[, "lwr"]
tukey_result <- cbind(tukey_result$asv_core, q_value = tukey_q) %>%
as.data.frame() %>%
rownames_to_column("comp")
print(knitr::kable(tukey_result))| comp | diff | lwr | upr | p adj | q_value |
|---|---|---|---|---|---|
| Core-All | -14.428571 | -39.019830 | 10.16269 | 0.3875613 | 0.3697754 |
| Non-core-All | 8.571429 | -16.019830 | 33.16269 | 0.7721659 | -0.5350512 |
| Rare-All | 51.000000 | 26.408742 | 75.59126 | 0.0000382 | 1.9311787 |
| Non-core-Core | 23.000000 | -1.591258 | 47.59126 | 0.0726252 | -14.4539686 |
| Rare-Core | 65.428571 | 40.837313 | 90.01983 | 0.0000008 | 1.6021762 |
| Rare-Non-core | 42.428571 | 17.837313 | 67.01983 | 0.0004184 | 2.3786414 |
cat('\n')
# (C) Number of modules per land use, per ASV group.
# FIX 1: annotations now follow the Tukey HSD table printed above:
#   Core vs Non-core  p.adj = 0.073  -> ns
#   Core vs Rare      p.adj < 1e-6   -> ***
#   Non-core vs Rare  p.adj = 4e-4   -> ***
# (the previous c("***","***","ns") was copy-pasted from panels A/B).
# FIX 2: scale_y_continuous(limits = c(0, 60)) silently REMOVED every
# observation above 60 before the boxplot statistics were computed — the Rare
# group mean alone is ~74.9 — so the boxes were drawn from truncated data.
# coord_cartesian() keeps the same zoom window without dropping data.
# NOTE(review): the Rare mean point (~74.9) still lies above this window.
g3 <- modules_per_grid %>%
ggplot(aes(x=asv_core, y=n, fill=asv_core)) +
geom_boxplot(outliers = FALSE, alpha = 0.9) +
geom_jitter(color="black", size=1) +
theme_classic() +
coord_cartesian(ylim = c(0, 60)) +
geom_signif(comparisons = list(c("Core", "Non-core"),c("Core", "Rare"),c("Non-core", "Rare")),
annotations = c("ns","***","***"), y_position = c(45,55,50), tip_length = 0) +
geom_point(data = modules_per_grid_summary, aes(x=asv_core, y = mean), position = position_dodge(width = 0.75), size = 2, color="#d62828",alpha=0.8, show.legend=F) +
theme(axis.text = element_text(size = 11, color = 'black'), title = element_text(size = 15), legend.position = "none") +
scale_fill_manual(values=group.colors) +
labs(x="ASV Group", y="No. of Modules per Land Use", title = "(C)")
# plotting: combine panels A-C with a collected x-axis title
g1+g2+g3 + plot_layout(axis_titles = "collect_x")
# combining final results
# Community assembly processes between host pairs (assembly_three_groups is
# computed elsewhere); classify each pair as within- or between-module.
assembly_final <- assembly_three_groups %>%
mutate(same_module = ifelse(host_group1==host_group2, "Same","Different"))
# renaming the process (dot-separated labels -> human-readable; everything
# else collapses to "Drift"; .default requires dplyr >= 1.1)
assembly_final %<>% mutate(process = case_when(process=="Heterogeneous.Selection" ~ "Heterogeneous Selection",
process=="Homogeneous.Selection" ~ "Homogeneous Selection",
process=="Dispersal.Limitation" ~ "Dispersal Limitation",
process=="Homogenizing.Dispersal" ~ "Homogenizing Dispersal",
.default = "Drift"))
# summary counts per (ASV group, module relation, process)
assembly_summary <- assembly_final %>%
count(asv_core, same_module, process)
assembly_summary_total <- assembly_summary %>%
group_by(asv_core) %>%
summarise(n_total = sum(n))
assembly_summary_total_module <- assembly_summary %>%
group_by(asv_core, same_module) %>%
summarise(n_total = sum(n))
#plotting process ratio between groups
# NOTE(review): case_when above can yield 5 process levels but only 4 fill
# colors are supplied — confirm that at most 4 processes occur in the data.
g1 <- assembly_summary %>%
group_by(asv_core, process) %>%
summarise(n = sum(n)) %>%
left_join(assembly_summary_total, by=c("asv_core")) %>%
mutate(n_p = n/n_total) %>%
ggplot(aes(fill=process, x=asv_core, y=n_p)) +
geom_bar(position="fill", stat="identity") +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label = paste0(round(n_p*100,1),"%")),
position = position_stack(vjust = 0.5), size = 3)+
theme_bw() +
theme(axis.text = element_text(size = 11, color = 'black', angle = 90, vjust = 0.5, hjust=1), title = element_text(size = 14),
strip.text = element_text(size=12, color = 'black'), strip.background = element_rect(color = "grey80", size = 1),
panel.grid = element_blank(), panel.background = element_rect(colour = "black")) +
scale_fill_manual(values=c("#a7c957","#f2e8cf","#f07167","#83c5be")) +
labs(x="ASVs Groups", y="Percentage")
print(g1)#plotting process ratio between modules (Same vs Different), faceted by group
g2 <- assembly_summary %>%
left_join(assembly_summary_total_module, by=c("asv_core","same_module")) %>%
mutate(n_p = n/n_total) %>%
ggplot(aes(fill=process, x=same_module, y=n_p)) +
geom_bar(position="fill", stat="identity") +
facet_wrap(~asv_core) +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label = paste0(round(n_p*100,1),"%")),
position = position_stack(vjust = 0.5), size = 3)+
theme_bw() +
theme(axis.text = element_text(size = 9, color = 'black', angle = 90, vjust = 0.5, hjust=1), title = element_text(size = 14),
strip.text = element_text(size=12, color = 'black'), strip.background = element_rect(color = "grey80", size = 1),
panel.grid = element_blank(), panel.background = element_rect(colour = "black")) +
scale_fill_manual(values=c("#a7c957","#f2e8cf","#f07167","#83c5be")) +
labs(x="", y="Percentage")
print(g2)# reading grid similarity results
# read_csv() returns a tibble, and tibbles do not retain row names (the
# assignment below raises a deprecation warning and the names are lost).
# Convert to a plain data.frame first so grid_village row names stick — they
# are used later to index modules_similarity2 by row name inside the RDA loop.
grids_similarity_attr <- as.data.frame(read_csv("../../data/data_processed/village_summary_new.csv"))
rownames(grids_similarity_attr) <- grids_similarity_attr$grid_village
# correlations between variables
print(psych::pairs.panels(grids_similarity_attr %>% select(-grid_village,-village,-grid), ellipses = F, lm = F))NULL
cat('\n','\n')# RDA for modules: environment vs module-composition similarity
# Accumulators for model tests, per-term tests, top module scores and figures.
anova_model_all <- NULL
anova_rda_all <- NULL
score_modules_top_all <- NULL
rda_figs <- list()
set.seed(123)
# loop over the four ASV groups (All, Core, Non-core, Rare)
for (i in 1:4) {
modules_similarity2 <- modules_similarity2_three_groups[[i]]
grid_names <- rownames(modules_similarity2)
# filtering matrix to the existing grids (drop village-level aggregate rows)
grids_similarity_attr2 <- grids_similarity_attr %>% filter(grid_village %in% grid_names & !grepl("village", grid_village))
# align response matrix rows to the attribute table by row name
modules_similarity2 <- modules_similarity2[rownames(grids_similarity_attr2),]
# sampling effort: number of distinct hosts per village-grid
n=data_asv %>% group_by(village,grid) %>% summarise(n=n_distinct(host_ID))
grids_similarity_attr2 %<>% left_join(n, by=c("village","grid"))
# tb-RDA
# hellinger transformation
modules_similarity2_hell <- decostand(modules_similarity2, 'hell')
rda_result <- rda(modules_similarity2_hell ~ veg_PC1+veg_PC2+dist_to_village+elevation+Condition(n), data=grids_similarity_attr2, na.action = na.omit)
# **Condition(n) - control for the number of host in each village-grid.
# variation explained
constrained_eig <- t(as.data.frame(rda_result$CCA$eig/rda_result$tot.chi*100)) # for RDAs
# unconstrained_eig <- rda_result$CA$eig/rda_result$tot.chi*100
# expl_var <- c(constrained_eig, unconstrained_eig)
# barplot (expl_var[1:20], col = c(rep ('red', length (constrained_eig)), rep ('black', length (unconstrained_eig))),
# las = 2, ylab = '% variation')
R2.obs <- RsquareAdj (rda_result)$adj.r.squared
# significance test for the whole model
anova_model <- anova(rda_result)[1,] %>%
mutate(R2.adj = R2.obs, asv_core = core_names[i])
anova_model_all <- rbind(anova_model_all, anova_model)
# significance test for all variables (marginal effects, Holm-adjusted)
anova_rda <- anova(rda_result, by = "margin", permutations = 999) %>%
mutate(asv_core = core_names[i])
anova_rda$`Pr(>F)` <- p.adjust (anova_rda$`Pr(>F)`, method = 'holm')
anova_rda_all <- rbind(anova_rda_all, anova_rda)
# scores: top-5 modules by absolute loading on each RDA axis
score_modules <- as.data.frame(vegan::scores(rda_result)$species) %>%
mutate(RDA1_abs = abs(RDA1), RDA2_abs = abs(RDA2))
score_modules_top <- score_modules %>%
slice_max(n = 5, order_by = RDA1_abs) %>%
bind_rows(score_modules %>% slice_max(n = 5, order_by = RDA2_abs)) %>%
rownames_to_column("host_group") %>%
mutate(asv_core = core_names[i])
score_modules_top_all <- rbind(score_modules_top_all, score_modules_top)
# plotting
# samples (site scores joined back to village/grid attributes)
rda_samples <- as.data.frame(scores(rda_result)$sites) %>%
rownames_to_column("grid_village") %>%
left_join(grids_similarity_attr2[c("grid_village","village","grid")], by="grid_village") %>%
mutate(asv_core = core_names[i])
# variables (biplot arrows); renaming relies on the formula term order above
rda_var <- as.data.frame(rda_result$CCA$biplot)
rownames(rda_var) <- c("Veg_1","Veg_2","Distance","Elevation")
# plot
g <- rda_samples %>%
ggplot(aes(RDA1, RDA2)) +
geom_hline(yintercept = 0, linetype = "dotted", color = "grey") +
geom_vline(xintercept = 0, linetype = "dotted", color = "grey") +
geom_point(size = 4, alpha=0.9, color=group.colors[core_names[i]]) +
geom_segment(data = rda_var, inherit.aes = FALSE, aes(x = 0, y = 0, xend = (RDA1*0.85), yend = (RDA2*0.85)),
arrow = arrow(length = unit(0.15, "cm"))) +
geom_label(data = rda_var,inherit.aes = FALSE, aes(x = RDA1, y = RDA2, label=rownames(rda_var)),size=3, alpha=0.5)+
theme_bw() +
coord_cartesian(xlim = range(rda_samples$RDA1) * c(1.4, 1.3),
ylim = range(rda_samples$RDA2) * c(1.4, 1.3),
clip = 'off') +
theme(panel.border = element_rect(colour = "black", size=1), panel.grid = element_blank())+
labs(x = paste("RDA1 (",round(constrained_eig[1],2),"%)", sep = ""), y = paste("RDA2 (",round(constrained_eig[2],2),"%)", sep = ""),
title = paste(core_names[i]), color = "Land Use", shape = "Village")
rda_figs <- append(rda_figs, list(g))
}
print(knitr::kable(anova_model_all))| Df | Variance | F | Pr(>F) | R2.adj | asv_core | |
|---|---|---|---|---|---|---|
| Model | 4 | 0.1569590 | 1.207555 | 0.089 | 0.0522835 | All |
| Model1 | 4 | 0.0749485 | 1.205524 | 0.229 | 0.0491638 | Core |
| Model2 | 4 | 0.2529385 | 1.613124 | 0.001 | 0.1398641 | Non-core |
| Model3 | 4 | 0.2319469 | 1.197271 | 0.013 | 0.0484911 | Rare |
# Print the Holm-adjusted per-term RDA tests accumulated in the loop above.
cat('\n','\n')print(knitr::kable(anova_rda_all))| Df | Variance | F | Pr(>F) | asv_core | |
|---|---|---|---|---|---|
| veg_PC1 | 1 | 0.0512348 | 1.5766882 | 0.140 | All |
| veg_PC2 | 1 | 0.0292258 | 0.8993876 | 1.000 | All |
| dist_to_village | 1 | 0.0386276 | 1.1887177 | 0.702 | All |
| elevation | 1 | 0.0318644 | 0.9805881 | 1.000 | All |
| Residual | 11 | 0.3574472 | NA | NA | All |
| veg_PC11 | 1 | 0.0246650 | 1.5869155 | 0.532 | Core |
| veg_PC21 | 1 | 0.0097680 | 0.6284648 | 1.000 | Core |
| dist_to_village1 | 1 | 0.0175965 | 1.1321409 | 1.000 | Core |
| elevation1 | 1 | 0.0168504 | 1.0841379 | 1.000 | Core |
| Residual1 | 11 | 0.1709699 | NA | NA | Core |
| veg_PC12 | 1 | 0.0828671 | 2.1139511 | 0.004 | Non-core |
| veg_PC22 | 1 | 0.0379223 | 0.9674034 | 0.532 | Non-core |
| dist_to_village2 | 1 | 0.0682650 | 1.7414498 | 0.028 | Non-core |
| elevation2 | 1 | 0.0753143 | 1.9212762 | 0.009 | Non-core |
| Residual2 | 11 | 0.4312013 | NA | NA | Non-core |
| veg_PC13 | 1 | 0.0548070 | 1.1316180 | 0.390 | Rare |
| veg_PC23 | 1 | 0.0518055 | 1.0696447 | 0.390 | Rare |
| dist_to_village3 | 1 | 0.0613099 | 1.2658852 | 0.126 | Rare |
| elevation3 | 1 | 0.0658914 | 1.3604824 | 0.032 | Rare |
| Residual3 | 11 | 0.5327565 | NA | NA | Rare |
cat('\n','\n')rda_figs[[1]] + rda_figs[[2]] + rda_figs[[3]] + rda_figs[[4]] + plot_layout(ncol=2, nrow=2) & theme(legend.position='none')cat('\n','\n')library(caret)
library(yardstick)
library(randomForest)
library(ROSE)
# Load the per-host ML feature table; "module" is the binary target column.
# The duplicated host_ID columns left over from an earlier join are dropped,
# and the categorical columns are coerced to factors in one pass.
df <- read_csv("../data/data_processed/ML_module/ML_rattus_three_villages_core.csv") %>%
select(-c(host_ID.x, host_ID.y)) %>%
mutate(across(c(module, grid, sex, season), as.factor))
# Fix the RNG state so the train/test split is reproducible
set.seed(123)
# Stratified 80/20 train/test split on the target (caret keeps the
# class proportions of df$module in both partitions)
train_index <- createDataPartition(df$module, p = 0.8, list = FALSE)
train_df <- df[train_index, ]
test_df <- df[-train_index, ]
# 5-fold cross-validation settings, shared by train() below
control <- trainControl(method = "cv", number = 5)
# Balance the binary target by random under-sampling of the majority class.
#
# @param df A data frame containing a "module" column (the binary target)
#   plus feature columns.
# @return A data frame with the same columns as `df` (with "module" moved
#   last, matching the original output layout) in which the majority class
#   has been down-sampled via ROSE::ovun.sample; seed = 1 makes the
#   sampling reproducible.
undersample <- function(df) {
  # Reorder so the target column is last, by name. The original split the
  # frame with `-which(names(df) == "module")` and cbind-ed it back, which
  # is a no-op apart from the reorder and silently drops EVERY column when
  # the match is empty; setdiff() is safe in that case.
  data <- df[c(setdiff(names(df), "module"), "module")]
  balanced_data <- ovun.sample(module ~ ., data = data, method = "under", seed = 1)$data
  return(balanced_data)
}
# Balance the training partition, then fit a random forest through caret
balanced_train_df <- undersample(train_df)
# Predictor frame / response factor for the balanced training data
X_train <- balanced_train_df[, setdiff(names(balanced_train_df), "module")]
y_train <- as.factor(balanced_train_df$module)
# Random forest, tuned by the 5-fold CV defined in `control`,
# with accuracy as the selection metric
rf_model <- train(
  X_train, y_train,
  method = "rf",
  trControl = control,
  metric = "Accuracy"
)
# Print the final fitted forest
print(rf_model$finalModel)
# Evaluate the model on the held-out test set
X_test <- test_df[, -which(names(test_df) == "module")]
y_test <- as.factor(test_df$module)
# Probability of the FIRST factor level. yardstick's probability metrics
# (roc_auc, pr_auc, roc_curve, pr_curve) treat the first level of `truth`
# as the event by default (event_level = "first"), and the class metrics
# above/below already rely on that default. The original took column 2
# (second level's probability), which inverts the ROC/PR curves and
# reports 1 - AUC.
predictions_prob <- predict(rf_model$finalModel, X_test, type = "prob")[, 1]
predictions_class <- predict(rf_model$finalModel, X_test)
# Assemble truth/estimate columns in the layout yardstick expects
results <- data.frame(
  truth = y_test,
  predicted_prob = predictions_prob,
  predicted_class = predictions_class
)
# Class-label metrics on the test predictions. yardstick treats the
# first factor level of `truth` as the event by default.
prec_tbl <- yardstick::precision(results, truth = truth, estimate = predicted_class)
print(paste("Precision:", prec_tbl$.estimate))
rec_tbl <- yardstick::recall(results, truth = truth, estimate = predicted_class)
print(paste("Recall:", rec_tbl$.estimate))
# F1 = harmonic mean of precision and recall
f1_tbl <- yardstick::f_meas(results, truth = truth, estimate = predicted_class)
print(paste("F1 Score:", f1_tbl$.estimate))
# ROC curve with the AUC shown in the subtitle.
# NOTE(review): yardstick's prob metrics treat the FIRST factor level as
# the event by default; confirm predicted_prob (built upstream) is the
# probability of that same level.
auc_roc_score <- yardstick::roc_auc(results, truth, predicted_prob)
roc_points <- yardstick::roc_curve(results, truth, predicted_prob)
roc_obj <- ggplot(roc_points, aes(x = 1 - specificity, y = sensitivity)) +
  geom_path(color = "blue") +
  geom_abline(lty = 3) +  # chance diagonal for reference
  coord_equal() +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) +
  labs(title = "ROC Curve", subtitle = paste("AUC = ", round(auc_roc_score$.estimate, 3)))
print(roc_obj)
cat('\n', '\n')
# Precision-recall curve with its AUC in the subtitle
auc_pr_score <- yardstick::pr_auc(results, truth = truth, predicted_prob)
pr_points <- yardstick::pr_curve(results, truth = truth, predicted_prob)
pr_obj <- ggplot(pr_points, aes(x = recall, y = precision)) +
  geom_path(color = "blue") +
  coord_equal() +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) +
  labs(title = "Precision-Recall curve", subtitle = paste("AUC = ", round(auc_pr_score$.estimate, 3)))
print(pr_obj)
cat('\n', '\n')
# Feature importance from the fitted random forest, using the first
# importance column (labeled here as mean decrease in Gini).
final_rf <- rf_model$finalModel
# Renamed local (was `importance`) so it no longer shadows the
# randomForest::importance() function it is assigned from.
imp_matrix <- importance(final_rf)
var_importance <- data.frame(
  Variables = rownames(imp_matrix),
  Importance = imp_matrix[, 1]
)
# Horizontal bars, ordered by importance
ggplot(var_importance, aes(x = reorder(Variables, Importance), y = Importance)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Feature Importance") +
  xlab("Variables") +
  ylab("Importance (Mean Decrease in Gini)") +
  theme_minimal()